knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
set.seed(119)
#library(conflicted)
#pacman::p_depends(microbiomeMarker, local = TRUE)
#pacman::p_depends_reverse(microbiomeMarker, local = TRUE)
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
pacman::p_load(tidyverse, patchwork, ampvis2,
agricolae, labdsv, naniar, ape,
pairwiseAdonis, microbiome, seqRFLP, microbiomeMarker,
microbiomeMarker, reactable, downloadthis, captioner,
install = FALSE, update = FALSE)
options(scipen=999)
knitr::opts_current$get(c(
"cache",
"cache.path",
"cache.rebuild",
"dependson",
"autodep"
))Click here for setup information.
Synopsis
This workflow contains ASV differential abundance analysis for the high temperature data sets. In order to run the workflow, you either need to first run the Data Preparation workflow and the Filtering workflow or or download the files linked below. See the Data Availability page for complete details.
Workflow Input
Files needed to run this workflow can be downloaded from figshare.
In this workflow, we use the labdsv package (Roberts and Roberts 2016) to run Dufrene-Legendre Indicator Species Analysis and the microbiomeMarker package (Cao 2020) to run linear discriminant analysis (LDA) effect size (LEfSe) (Segata et al. 2011).
16s rRNA
Indicator Analysis
Indicator Analysis calculates the indicator value (fidelity and relative abundance) of species in clusters or types.
- Choose a data set(s) & set the p-value cutoff.
Code
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt",
"ssu18_ps_perfect", "ssu18_ps_pime")
indval_pval <- 0.05- Format Data Frame
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(otu_table(tmp_get))
tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
tmp_row_names <- row.names(tmp_df)
tmp_row_names <- tmp_row_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects()- Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
pval = tmp_pv, freq = tmp_fr)
tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]
tmp_tax_df <- data.frame(tax_table(get(i)))
tmp_tax_df$ASV_ID <- NULL
tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
class(tmp_sum_tax$group) <- "character"
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
tmp_sum_tax$ASV_ID <- as.character(tmp_sum_tax$ASV_ID)
tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
assign(tmp_res_name, tmp_sum_tax)
rm(list = ls(pattern = "tmp_"))
}
objects()Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_get[,1] <- NULL
tmp_df <- as.data.frame(t(tmp_get))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
#tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_indval_summary")- Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()Indicator Summary
LEfSe Analysis
From (Segata et al. 2011).
Linear discriminant analysis (LDA) effect size (LEfSe) is a method to support high-dimensional class comparisons with a particular focus on microbial community analyses. LEfSe determines the features (organisms, clades, operational taxonomic units, genes, or functions) most likely to explain differences between classes by coupling standard tests for statistical significance with additional tests encoding biological consistency and effect relevance. Class comparison methods typically predict biomarkers consisting of features that violate a null hypothesis of no difference between classes. The effect size provides an estimation of the magnitude of the observed phenomenon due to each characterizing feature and it is thus a valuable tool for ranking the relevance of different biological aspects and for addressing further investigations and analyses.
First, a little data tidying and subsetting only ASV data from the taxonomy table.
Code
for (i in samp_ps) {
tmp_get <- get(i)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
tax_table(tmp_get) <- cbind(tax_table(tmp_get),
rownames(tax_table(tmp_get)))
colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
assign(tmp_name, tmp_get)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
assign(tmp_name_asv, tmp_get)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")- Choose a data set(s) & set the LEfSe cutoff values.
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", "ssu18_ps_perfect", "ssu18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")Code
for (i in samp_ps) {
tmp_o_tax <- data.frame(tax_table(get(i)))
tmp_o_tax$ASV_ID <- NULL
tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
tibble::remove_rownames()
tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")
tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
assign(tmp_res_name, tmp_sum)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(t(otu_table(tmp_get)))
#tmp_get[,1] <- NULL
#tmp_df <- as.data.frame(t(tmp_otu))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
tmp_df$freq <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()LEfSe Summaries (ASVs & Markers)
ASV summary data
ASV Visualizations
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv_viz_trim")
(16S rRNA) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.
Marker summary data
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt[, c(2:4,1)]
tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature",
into = c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "ASV_ID"),
sep = "\\|\\w__")
tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_marker_final")(16S rRNA) Table 10 | Significant results of LEfSe MARKER Analysis for Arbitrary filtered ASV data set.
Marker visualizations
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 4)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_all_viz_trim")
(16S rRNA) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 4.0.
Response of select taxa
In this section of the workflow we use the microbiomeMarker package to assess the response of taxonomic lineages to soil warming. In the first step we need to fix the selected data set to make it compatible with the various functions. For this analysis we use the PERfect filtered data set.
Code
## FIX ps object
ssu_ps <- ssu18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ssu_ps))
#tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ssu_ps),
phy_tree(ssu_ps),
tax_table(ps_tax_new),
sample_data(ssu_ps))
ssu_ps <- tmp_ps
phyloseq::rank_names(ssu_ps)
data.frame(tax_table(ssu_ps))Next we run a statistical test for multiple groups using the run_test_multiple_groups function.
Code
ssu_group_anova <- run_test_multiple_groups(ssu_ps, group = "TEMP",
taxa_rank = "all",
method = "anova")
ssu_group_anova@marker_table
marker_table(ssu_group_anova)And then conduct ost hoc pairwise comparisons for multiple groups test using the run_posthoc_test function.
Code
ssu_default_pht <- run_posthoc_test(ssu_ps, group = "TEMP",
method = "tukey", transform = "log10")We can filter out a select taxa and plot the results.
filter(data.frame(ssu_default_pht@result), group_name == "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales") group group_name
3-0.140 141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-0.140 141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-3.140 141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
comparions diff_mean pvalue ci_lower ci_upper
3-0.140 3-0 0.007492377 0.645933374 -0.0145296109 0.02951437
8-0.140 8-0 0.030358682 0.008215796 0.0083366941 0.05238067
8-3.140 8-3 0.022866305 0.041743700 0.0008443168 0.04488829
plot_postHocTest(ssu_default_pht, feature = "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales") & theme_bw()
But what we really want to do is get all of the markers that are significant from the analysis, excluding any significant ASVs so we can look at high taxa ranks.
Code
ssu_pht <- ssu_default_pht
ssu18_pht_filt <- filter(data.frame(ssu_pht@result), pvalue <= "0.05")[!grepl("ASV", filter(data.frame(ssu_pht@result), pvalue <= "0.05")$group_name),]
ssu18_pht_filt <- ssu18_pht_filt[!grepl("[a-z]__$", ssu18_pht_filt$group_name),]
ssu18_pht_filt <- unique(ssu18_pht_filt$group_name)
length(ssu18_pht_filt)
save.image("page_build/mmarker_wf.rdata")There are 47 significantly different lineage markers.
Click here to see a list of significantly different lineage markers.
[1] "k__Archaea|p__Thermoplasmatota"
[2] "k__Bacteria|p__Actinobacteriota"
[3] "k__Bacteria|p__Bacteroidota"
[4] "k__Bacteria|p__Firmicutes"
[5] "k__Bacteria|p__MBNT15"
[6] "k__Bacteria|p__Myxococcota"
[7] "k__Archaea|p__Thermoplasmatota|c__Thermoplasmata"
[8] "k__Bacteria|p__Acidobacteriota|c__Subgroup_11"
[9] "k__Bacteria|p__Acidobacteriota|c__Subgroup_22"
[10] "k__Bacteria|p__Actinobacteriota|c__MB-A2-108"
[11] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia"
[12] "k__Bacteria|p__Bacteroidota|c__Bacteroidia"
[13] "k__Bacteria|p__Firmicutes|c__Bacilli"
[14] "k__Bacteria|p__Myxococcota|c__bacteriap25"
[15] "k__Bacteria|p__Planctomycetota|c__Pla4_lineage"
[16] "k__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Subgroup_2"
[17] "k__Bacteria|p__Actinobacteriota|c__Acidimicrobiia|o__Microtrichales"
[18] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales"
[19] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales"
[20] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales"
[21] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales"
[22] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales"
[23] "k__Bacteria|p__Myxococcota|c__Polyangia|o__mle1-27"
[24] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales"
[25] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales|f__Gaiellaceae"
[26] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Saprospiraceae"
[27] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales|f__Hymenobacteraceae"
[28] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales|f__Flavobacteriaceae"
[29] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales|f__AKYH767"
[30] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae"
[31] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales|f__BIrii41"
[32] "k__Bacteria|p__Myxococcota|c__Polyangia|o__Polyangiales|f__Polyangiaceae"
[33] "k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Hyphomonadaceae"
[34] "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales|f__Gaiellaceae|g__Gaiella"
[35] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Chitinophagaceae|g__Edaphobaculum"
[36] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Chitinophagaceae|g__UTBCD1"
[37] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales|f__Hymenobacteraceae|g__Adhaeribacter"
[38] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales|f__Flavobacteriaceae|g__Flavobacterium"
[39] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae|g__Chungangia"
[40] "k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Planococcaceae|g__Paenisporosarcina"
[41] "k__Bacteria|p__Myxococcota|c__Myxococcia|o__Myxococcales|f__Myxococcaceae|g__Corallococcus"
[42] "k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Hyphomonadaceae|g__SWB02"
[43] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Ralstonia"
[44] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Ramlibacter"
[45] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Rubrivivax"
[46] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Nitrosomonadaceae|g__mle1-7"
[47] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Arenimonas"
Now we select a subset of the 47 markers to plot and visualize the results.
Code
ssu_select <- c(
"k__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Subgroup_2",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Saprospiraceae",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales",
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales",
"k__Bacteria|p__Myxococcota|c__Polyangia|o__mle1-27",
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Rubrivivax",
"k__Bacteria|p__Actinobacteriota|c__Acidimicrobiia|o__Microtrichales",
"k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales",
"k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales",
"k__Bacteria|p__Myxococcota|c__Myxococcia|o__Myxococcales|f__Myxococcaceae|g__Corallococcus",
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Ralstonia"
)
(16S rRNA) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles and whiskers represent + 1.5 the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).
ITS
Indicator Analysis
- Choose a data set(s) & set the p-value cutoff.
samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime")
indval_pval <- 0.05- Format Data Frame
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(otu_table(tmp_get))
tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
tmp_row_names <- row.names(tmp_df)
tmp_row_names <- tmp_row_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects()- Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
pval = tmp_pv, freq = tmp_fr)
tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]
tmp_tax_df <- data.frame(tax_table(get(i)))
tmp_tax_df$ASV_ID <- NULL
tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
class(tmp_sum_tax$group) <- "character"
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
tmp_sum_tax$ASV_ID <- as.character(tmp_sum_tax$ASV_ID)
tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
assign(tmp_res_name, tmp_sum_tax)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_get[,1] <- NULL
tmp_df <- as.data.frame(t(tmp_get))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
#tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0",
"W3",
"W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}- Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()Indicator Summary
LEfSe Analysis
Details about LEFSE
Remove ASV seq, subset only ASVs
Code
for (i in samp_ps) {
tmp_get <- get(i)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
tax_table(tmp_get) <- cbind(tax_table(tmp_get),
rownames(tax_table(tmp_get)))
colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
assign(tmp_name, tmp_get)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
assign(tmp_name_asv, tmp_get)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")- Choose a data set(s) & set the LEfSe cutoff values.
samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05 Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")Code
for (i in samp_ps) {
tmp_o_tax <- data.frame(tax_table(get(i)))
tmp_o_tax$ASV_ID <- NULL
tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
tibble::remove_rownames()
tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")
tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
assign(tmp_res_name, tmp_sum)
rm(list = ls(pattern = "tmp_"))
}Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(t(otu_table(tmp_get)))
#tmp_get[,1] <- NULL
#tmp_df <- as.data.frame(t(tmp_otu))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
tmp_df$freq <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()LEfSe Summaries (ASVs & Markers)
ASV summary data
ASV Visualizations
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv_viz_trim")
(ITS) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.
Marker summary data
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt[, c(2:4,1)]
tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature",
into = c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "ASV_ID"),
sep = "\\|\\w__")
tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}Marker visualizations
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt <- tmp_mt %>% filter(ef_lda >= 3.5)
tmp_mt <- marker_table(tmp_mt)
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
assign(tmp_res_name, tmp_mt)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_all_viz_trim")
(ITS) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 4.0.

(ITS) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 4.0.
Response of select taxa
The workflow here is basically a carbon copy of the workflow describe above. Presented here for posterity.
Click here to see the ITS version of this workflow.
## FIX ps object
ps <- its18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ps))
tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ps),
tax_table(ps_tax_new),
sample_data(ps))
its_ps <- tmp_ps
phyloseq::rank_names(its_ps)
data.frame(tax_table(its_ps))its_group_anova <- run_test_multiple_groups(its_ps, group = "TEMP",
taxa_rank = "all", method = "anova")
its_group_anova@marker_table
marker_table(its_group_anova)its_default_pht <- run_posthoc_test(its_ps,
group = "TEMP", method = "tukey",
transform = "log10")its_pht <- its_default_pht
its18_pht_filt <- filter(data.frame(its_pht@result), pvalue <= "0.05")[!grepl("ASV", filter(data.frame(its_pht@result), pvalue <= "0.05")$group_name),]
its18_pht_filt <- its18_pht_filt[!grepl("[a-z]__$", its18_pht_filt$group_name),]
its18_pht_filt <- unique(its18_pht_filt$group_name)
length(its18_pht_filt)There are 32 significantly different lineage markers.
Click here to see a list of significantly different lineage markers.
[1] "k__Fungi"
[2] "k__Fungi|p__Basidiomycota"
[3] "k__Fungi|p__Chytridiomycota"
[4] "k__Fungi|p__Glomeromycota"
[5] "k__Fungi|p__Mortierellomycota"
[6] "k__Fungi|p__Basidiomycota|c__Agaricomycetes"
[7] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes"
[8] "k__Fungi|p__Glomeromycota|c__Glomeromycetes"
[9] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes"
[10] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis"
[11] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales"
[12] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales"
[13] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales"
[14] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales"
[15] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis|o__GS11"
[16] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae"
[17] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae"
[18] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae"
[19] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae"
[20] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Clavariaceae"
[21] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Entolomataceae"
[22] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae"
[23] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae"
[24] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae"
[25] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae|g__Talaromyces"
[26] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae|g__Scutellinia"
[27] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae|g__Metschnikowia"
[28] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Hypocreales|f__Nectriaceae|g__Fusarium"
[29] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae|g__Idriella"
[30] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae|g__Rhodotorula"
[31] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae|g__Kamienskia"
[32] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae|g__Mortierella"

(ITS) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles and whiskers represent + 1.5 the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).
Visualizing DA ASVs in Anvi’o
Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.
anvi-interactive -hMANUAL INPUTS:
Mandatory input parameters to start the interactive interface without
anvi'o databases.
--manual-mode We need this flag to run anvi'o in an ad hoc
manner, i.e., no database.
-f FASTA, --fasta-file FASTA
A FASTA-formatted input file. This is sort of
optional
-d VIEW_DATA, --view-data VIEW_DATA
A TAB-delimited file for view data. This is the ASV
by sample matrix. We need this
-t NEWICK, --tree NEWICK
NEWICK formatted tree structure. How the ASVs are
ordered in our case.
ADDITIONAL STUFF:
Parameters to provide additional layers, views, or layer data.
-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
A TAB-delimited file for an additional view to be used
in the interface. This file should contain all split
names, and values for each of them in all samples.
Each column in this file must correspond to a sample
name. Content of this file will be called 'user_view',
which will be available as a new item in the 'views'
combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
A TAB-delimited file for additional layer info. In
our case this is info about each ASV. The first column
of the file must be the ASV names, and
the remaining columns should be unique attributes.
There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.
Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out the visualization we need to create a few files to give anvi’o when we fire up the interactive interface.
- View data: in our case, a sample by ASV abundance matrix.
- Additional info about each ASV.
- Additional info about each sample.
- Taxa abundance data for each sample at some rank.
- Dendrograms ordering the ASVs and samples (based on view data).
- Fasta file of all ASVs in the analysis.
Main Steps
1. View data
Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we will use all ASVs represented by 100 or more total reads, including those identified as differentially abundant by the ISA and/or LEfSe.
Code
samp_ps_all <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps_org <- c("ssu18_ps_work")
samp_ps_pime <- c("ssu18_ps_pime")
samp_ps_other <- c("ssu18_ps_filt", "ssu18_ps_perfect")
trim_val <- 100
for (i in samp_ps_pime) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
trim_val <- 300
for (i in samp_ps_other) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}Or export a table of transformed data.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
#tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
#assign(tmp_name, tmp_df)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}2. Additional Layers for ASVs
Next, we need some additional data about the ASVs to overlay on the visual. This can be anything however what I specifically want are the details of the ISA analysis, total reads, and lineage info. I warn you; this code will get ugly and I urge you to find a better way.
Start with an ASV + lineage table for the ASVs in the new phyloseq object.
Code
for (i in samp_ps) {
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_indval" = "group") %>%
dplyr::rename("test_indval" = "indval") %>%
dplyr::rename("pval_indval" = "pval")
tmp_get_indval <- tmp_get_indval[,1:5]
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_lefse" = "group") %>%
dplyr::rename("test_lefse" = "lda") %>%
dplyr::rename("pval_lefse" = "pval")
tmp_get_lefse <- tmp_get_lefse[,1:4]
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
tmp_total <- rev(tmp_total)[1]
tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
tmp_tax_df <- as.data.frame(tax_table(tmp_get))
tmp_tax_df$ASV_SEQ <- NULL
tmp_tax_df$ASV_ID <- NULL
tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
dplyr::left_join(., tmp_get_indval, by = "Group") %>%
dplyr::left_join(., tmp_get_lefse, by = "Group")
tmp_add_lay$ASV_ID <- tmp_add_lay$Group
tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}3. Additional Views for Samples
Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.
Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.
Code
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
header = TRUE)
tmp_x <- readRDS("files/alpha/rdata/ssu18_ps_pime.rds")
data.frame(sample_data(tmp_x))
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
tmp_get <- get(i)
tmp_df <- data.frame(sample_data(tmp_get))
tmp_df <- tmp_df[,c(2:9)]
tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
tmp_rc <- data.frame(readcount(tmp_get))
tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
#identical(tmp_df$id, tmp_rc$id)
tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)4. Taxon rank abundance by sample
Turned out this was a little tricky to figure out, but thanks to a little nifty block of code written by guoyanzhao on the phyloseq Issues forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.
Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o has a specific format it needs where each row is a sample and each column is a taxon. Taxa names need the prefix t_<RANK>!. For example, t_class! should be added for Class rank.
Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.
Code
pick_rank <- "Phylum"
pick_rank_l <- "phylum"
for (i in samp_ps_all) {
# Make the table
tmp_get <- get(i)
tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
tmp_melt <- psmelt(tmp_glom)
tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
colnames(tmp_abund)[2] <- "tax_rank"
library(reshape2)
tmp_abund <- as.data.frame(reshape::cast(tmp_abund, Sample ~ tax_rank))
tmp_abund <- tibble::remove_rownames(tmp_abund)
tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}5. Construct Dendrograms
The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o sample.tre \
--transposeThe first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.
Alternatively, we can generate dendrograms using phyloseq::distance and hclust.
Code
pick_dist <- "bray"
pick_clust <- "complete"
for (i in samp_ps) {
# Make the table
tmp_get <- get(i)
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "sample")
tmp_dend <- hclust(tmp_dist, method = pick_clust)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/ssu/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}Code
pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"
for (i in samp_ps) {
# Make the table
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/ssu/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}The file asv.tre is loaded with anvi-interactive command & the --tree flag.
The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to be in a three column, tab delimited, table. This way you can add multiple orderings to the same file and view them all in the interactive. The table needs to be in this format:
| item_name | data_type | data_value |
|---|---|---|
| tree_1 | newick | ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,.. |
| tree_2 | newick | ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,… |
| (…) | (…) | (…) |
This is easy to do by hand, but I really need the practice so I will do it in R. Anvi’o is very particular about formatting. For example, if this file ends with a blank line, which it will because when anvi’o made the initial dendrogram it add a new line. We need to get rid of that or we get an error when trying to import the table.
The file sample.tre is added to the profile.db with anvi-import-misc-data command & --target-data-table layer_orders flag.
Code
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}Code
# FOR TRANSFORMED DATA
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/","sample_trans.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}
objects()Code
# FOR HCLUST TREE
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/ssu/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c(paste(pick_dist, "_", pick_clust, "_hclust", sep = ""))
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}6. Make a fasta file
We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.
Loaded with anvi-interactive command & --fasta-file flag.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_tab <- tax_table(tmp_get)
tmp_tab <- tmp_tab[, 7]
tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)
tmp_path <- file.path("files/anvio/ssu/")
write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
sep = "\n", col.names = FALSE, row.names = FALSE,
quote = FALSE, fileEncoding = "UTF-8")
rm(list = ls(pattern = "tmp_"))
}Building the Profile Database
Time to put all of these pieces together. This gets a little tricky since we do not have a database to start with because some of these files can be loaded directly in the interface but some need to be added to a database. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db \
--manualNow we have a new profile database that we can add the sample metadata (additional_layers.txt) and the sample dendrogram (sample.tre) using the command anvi-import-misc-data. These commands add the table to the new profile.db. First, kill the interactive.
anvi-import-misc-data additional_views.txt \
--pan-or-profile-db profile.db \
--target-data-table layers
anvi-import-misc-data sample.tre \
--pan-or-profile-db profile.db \
--target-data-table layer_ordersOne last this is to get the table with the taxonomy total by sample (tax_layers.txt) into the profile database. We will run the same command we just used.
anvi-import-misc-data tax_layers.txt \
--pan-or-profile-db profile.db \
--target-data-table layersIn fact, we could just as easily append the taxonomy total data onto the additional_layers.txt and import in one command. But we didn’t.
Interactive Interface
With a populated database in hand, we can now begin modifying the visual by running the interactive command again.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db
--fasta-file anvio.fasta \
--manualClick here for the ITS version of this workflow.
The ITS version of the anvi’o workflow is basically a carbon copy of the workflow presented above. It is included here for posterity.
Visualizing DA ASVs in Anvi’o
Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.
anvi-interactive -hMANUAL INPUTS:
Mandatory input parameters to start the interactive interface without
anvi'o databases.
--manual-mode We need this flag to run anvi'o in an ad hoc
manner, i.e., no database.
-f FASTA, --fasta-file FASTA
A FASTA-formatted input file. This is sort of
optional
-d VIEW_DATA, --view-data VIEW_DATA
A TAB-delimited file for view data. This is the ASV
by sample matrix. We need this
-t NEWICK, --tree NEWICK
NEWICK formatted tree structure. How the ASVs are
ordered in our case.
ADDITIONAL STUFF:
Parameters to provide additional layers, views, or layer data.
-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
A TAB-delimited file for an additional view to be used
in the interface. This file should contain all split
names, and values for each of them in all samples.
Each column in this file must correspond to a sample
name. Content of this file will be called 'user_view',
which will be available as a new item in the 'views'
combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
A TAB-delimited file for additional layer info. In
our case this is info about each ASV. The first column
of the file must be the ASV names, and
the remaining columns should be unique attributes.
There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.
Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out the visualization we need to create a few files to give anvi’o when we fire up the interactive interface.
- View data: in our case, a sample by ASV abundance matrix.
- Additional info about each ASV.
- Additional info about each sample.
- Taxa abundance data for each sample at some rank.
- Dendrograms ordering the ASVs and samples (based on view data).
- Fasta file of all ASVs in the analysis.
Main steps
1. View data
Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we will use all ASVs represented by 100 or more total reads, including those identified as differentially abundant by the ISA and/or LEfSe.
Code
samp_ps_all <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu",
"its18_ps_work", "its18_ps_work_otu")
samp_ps <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu")
samp_ps_org <- c("its18_ps_work", "its18_ps_work_otu")
samp_ps_pime <- c("its18_ps_pime", "its18_ps_pime_otu")
samp_ps_other <- c("its18_ps_filt", "its18_ps_perfect",
"its18_ps_filt_otu", "its18_ps_perfect_otu")
trim_val <- 50
for (i in samp_ps_pime) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
trim_val <- 300
for (i in samp_ps_other) {
tmp_get <- get(i)
tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")
its18_ps_filt_trim
its18_ps_pime_trim
its18_ps_perfect_trim
its18_ps_filt_otu_trim
its18_ps_pime_otu_trim
its18_ps_perfect_otu_trimCode
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
objects()Or export a table of transformed data.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
tmp_df <- tmp_df %>% rownames_to_column("Group")
#tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
#assign(tmp_name, tmp_df)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
objects()2. Additional Layers for ASVs
Next, we need some additional data about the ASVs to overlay on the visual. This can be anything however what I specifically want are the details of the ISA analysis, total reads, and lineage info. I warn you; this code will get ugly and I urge you to find a better way.
Start with an ASV + lineage table for the ASVs in the new phyloseq object.
Code
for (i in samp_ps) {
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_indval" = "group") %>%
dplyr::rename("test_indval" = "indval") %>%
dplyr::rename("pval_indval" = "pval")
tmp_get_indval <- tmp_get_indval[,1:5]
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
dplyr::rename("enrich_lefse" = "group") %>%
dplyr::rename("test_lefse" = "lda") %>%
dplyr::rename("pval_lefse" = "pval")
tmp_get_lefse <- tmp_get_lefse[,1:4]
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
tmp_total <- rev(tmp_total)[1]
tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
tmp_tax_df <- as.data.frame(tax_table(tmp_get))
tmp_tax_df$ASV_SEQ <- NULL
tmp_tax_df$ASV_ID <- NULL
tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
dplyr::left_join(., tmp_get_indval, by = "Group") %>%
dplyr::left_join(., tmp_get_lefse, by = "Group")
tmp_add_lay$ASV_ID <- tmp_add_lay$Group
tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}3. Additional Views for Samples
Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.
Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.
Code
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
header = TRUE)
tmp_x <- readRDS("files/alpha/rdata/its18_ps_pime.rds")
data.frame(sample_data(tmp_x))
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
tmp_get <- get(i)
tmp_df <- data.frame(sample_data(tmp_get))
tmp_df <- tmp_df[,c(2:9)]
tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
tmp_rc <- data.frame(readcount(tmp_get))
tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
#identical(tmp_df$id, tmp_rc$id)
tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)4. Taxon rank abundance by sample
Turned out this was a little tricky to figure out, but thanks to a little nifty block of code written by guoyanzhao on the phyloseq Iitses forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.
Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o has a specific format it needs where each row is a sample and each column is a taxon. Taxa names need the prefix t_<RANK>!. For example, t_class! should be added for Class rank.
Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.
Code
pick_rank <- "Order"
pick_rank_l <- "order"
for (i in samp_ps_all) {
# Make the table
tmp_get <- get(i)
tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
tmp_melt <- psmelt(tmp_glom)
tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
colnames(tmp_abund)[2] <- "tax_rank"
library(reshape2)
tmp_abund <- as.data.frame(reshape::cast(tmp_abund, Sample ~ tax_rank))
tmp_abund <- tibble::remove_rownames(tmp_abund)
tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
quote = FALSE, sep = "\t", row.names = FALSE)
rm(list = ls(pattern = "tmp_"))
}5. Construct Dendrograms
The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
--linkage ward \
-o sample.tre \
--transposeThe first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.
Alternatively, we can generate dendrograms using phyloseq::distance and hclust.
Code
pick_dist <- "bray"
pick_clust <- "complete"
for (i in samp_ps) {
# Make the table
tmp_get <- get(i)
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "sample")
tmp_dend <- hclust(tmp_dist, method = pick_clust)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/its/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}Code
pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"
for (i in samp_ps) {
# Make the table
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
plot(tmp_dend, hang = -1)
tmp_tree <- as.phylo(tmp_dend)
tmp_path <- file.path("files/anvio/its/")
write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
rm(list = ls(pattern = "tmp_"))
}The file asv.tre is loaded with anvi-interactive command & the --tree flag.
The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to be in a three column, tab delimited, table. This way you can add multiple orderings to the same file and view them all in the interactive. The table needs to be in this format:
| item_name | data_type | data_value |
|---|---|---|
| tree_1 | newick | ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,.. |
| tree_2 | newick | ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,… |
| (…) | (…) | (…) |
This is easy to do by hand, but I really need the practice so I will do it in R. Anvi’o is very particular about formatting. For example, if this file ends with a blank line, which it will because when anvi’o made the initial dendrogram it add a new line. We need to get rid of that or we get an error when trying to import the table.
The file sample.tre is added to the profile.db with anvi-import-misc-data command & --target-data-table layer_orders flag.
Code
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}Code
# FOR TRANSFORMED DATA
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/","sample_trans.tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c("bray_complete")
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}
objects()Code
# FOR HCLUST TREE
for (i in samp_ps) {
tmp_path <- file.path("files/anvio/its/")
tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
tmp_tree <- gsub("[\r\n]", "", tmp_tree)
tmp_item <- c(paste(pick_dist, "_", pick_clust, "_hclust", sep = ""))
tmp_type <- c("newick")
tmp_df <- c(tmp_tree)
tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
library(janitor)
tmp_tab %>% remove_empty("rows")
colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
sep = "\t", quote = FALSE, row.names = FALSE, na = "")
rm(list = ls(pattern = "tmp_"))
}6. Make a fasta file
We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.
Loaded with anvi-interactive command & --fasta-file flag.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
tmp_tab <- tax_table(tmp_get)
tmp_tab <- tmp_tab[, 7]
tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)
tmp_path <- file.path("files/anvio/its/")
write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
sep = "\n", col.names = FALSE, row.names = FALSE,
quote = FALSE, fileEncoding = "UTF-8")
rm(list = ls(pattern = "tmp_"))
}Building the Profile Database
Time to put all of these pieces together. This gets a little tricky since we do not have a database to start with because some of these files can be loaded directly in the interface but some need to be added to a database. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db \
--manualNow we have a new profile database that we can add the sample metadata (additional_layers.txt) and the sample dendrogram (sample.tre) using the command anvi-import-misc-data. These commands add the table to the new profile.db. First, kill the interactive.
anvi-import-misc-data additional_views.txt \
--pan-or-profile-db profile.db \
--target-data-table layers
anvi-import-misc-data sample.tre \
--pan-or-profile-db profile.db \
--target-data-table layer_ordersOne last this is to get the table with the taxonomy total by sample (tax_layers.txt) into the profile database. We will run the same command we just used.
anvi-import-misc-data tax_layers.txt \
--pan-or-profile-db profile.db \
--target-data-table layersIn fact, we could just as easily append the taxonomy total data onto the additional_layers.txt and import in one command. But we didn’t.
Interactive Interface
With a populated database in hand, we can now begin modifying the visual by running the interactive command again.
anvi-interactive --view-data data.txt \
--tree asv.tre \
--additional-layers additional_layers.txt \
--profile-db profile.db
--fasta-file anvio.fasta \
--manualWorkflow Output
Data products generated in this workflow can be downloaded from figshare.
Next workflow:
8. Metadata Analysis Previous workflow:
6. Beta Diversity & Dispersion Estimates
Source Code
The source code for this page can be accessed on GitHub by clicking this link.
Data Availability
Data generated in this workflow and the Rdata needed to run the workflow can be accessed on figshare at 10.25573/data.16828249.
Last updated on
[1] "2022-06-21 13:49:42 EST"